Analyse des données RNAseq DepMap (CCLE ; cancer) et des 57 epigenomes de Roadmap

# W O R K I N G  D I R E C T O R Y
#main.dir = "/shared/projects/chrom_enhancer_bc"
main.dir = file.path("/home/antonin/Documents/stage_2022/git/chrom_enhancer_bc/")
setwd(main.dir)

#data.dir = file.path(main.dir,"data/expression/rdata")
data.dir = "/home/antonin/Documents/stage_2022/data/expression/rdata/"


# I M P O R T
library(pheatmap)
library(ggpubr)
library(sva)
library(edgeR)
library(DESeq2)
library(DT)
library(stringr)
library(dplyr)
library(reshape2)
library(FactoMineR)
library(factoextra)
library(grid)
library(cowplot)
library(GGally)
source("./scripts/Normalization.R")
load(file.path(data.dir,"cancer_healthy_expression.RData"))

# V A R I A B L E S

of_interest = c(
  "POU5F1" ="ENSG00000204531",
  "MYC" =   "ENSG00000136997",
  "POLR3GL"="ENSG00000121851",
  "POLR3G" ="ENSG00000113356",
  "MEF2C" = "ENSG00000081189",
  "CETN3" = "ENSG00000153140",
  "MBLAC2" ="ENSG00000176055",
  "LYSMD3" ="ENSG00000176018",
  "ADGRV1" ="ENSG00000164199")

polr3g_full_name = expr$ENSEMBL_id[grepl(pattern = "ENSG00000113356", expr$ENSEMBL_id)]

# Names of the cell_lines
cell_lines = names(expr)[! names(expr) %in% c("gene_name","ENSEMBL_id")]

cell_of_interest = c("MDAMB231","MDAMB436","HCC1937", "HCC1806",
                "HCC1954","MDAMB468","K562")
color_cell = ifelse(grepl(paste(cell_of_interest,collapse = "|"),  cell_lines ), yes = "red", no ="black")

# F U N C T I O N S

# Centering vector by the median
MedianCentering <- function(x){
  (x - median(x))
}

# from a matrix, returns a list of 2 elements : continuous breaks and their associated colors 
get_color_scale_pheatmap = function(matrix,ncolors = 50,threshold = 1){
  range <- max(abs(matrix))
  myBreaks = seq(-range, range, length.out = ncolors)
  myBreaks[myBreaks > -threshold  &  myBreaks < threshold ] = 0
  myBreaks = unique(myBreaks)
  
  paletteLength <- length(myBreaks)
  myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength-1)
  return(list("colorscale" = myColor, "breaks" = myBreaks)) } 

# scatter plot with highlighted genes
plot_cell_lines = function(count_matrix, highlight_gene, cell_lines = "K562"){
  if (cell_lines == "K562") {
    df =  select(count_matrix, K562_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,K562)
    plot = ggplot(df, aes(x = K562_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE, y= K562 )) + geom_point()
    
    max_y = max(df[["K562"]])
  }
  else if (cell_lines == "A549"){
    df =  select(count_matrix, A549_LUNG,A549)
    plot = ggplot(df, aes(x = A549_LUNG, y= A549 )) + geom_point()
    
    max_y = max(df[["A549"]])
  }
   plot2 = plot +
    geom_point(data = df[highlight_gene,], 
              size = 3,
              aes(color = "POLR3G"))+
    geom_smooth(method = "lm", color = "#3366FF", aes(color = "Regression_line") ) +
    stat_regline_equation(label.y = 1.1*max_y, aes(label = ..eq.label..)) +
    stat_regline_equation(label.y = max_y, aes(label = ..rr.label..)) 
    
   return (plot2 + scale_color_manual(name='Legend',
                       breaks=c('POLR3G', 'Regression_line'),
                       values=c('POLR3G'='red', 'Regression_line'="#3366FF")))
}

# side by side plot for 2 pairs of cell lines A549 - K562
Multi_plot_A549_K562 = function(count_matrix, highlight_gene, title){
  k562_plot = plot_cell_lines(count_matrix, 
                highlight_gene = highlight_gene,
                cell_lines = "K562") +
                theme(legend.position="none") +
                ggtitle(paste(title,"K562"))
  
  a549_plot = plot_cell_lines(count_matrix, 
                highlight_gene = highlight_gene,
                cell_lines = "A549") + 
                ggtitle(paste(title,"A549"))
  
  return( plot_grid(k562_plot, a549_plot, labels = "AUTO", rel_widths = c(9,10))) 
}

# Boxplot from a 3 column DF : reshape2::melt(DF) + point for POLR3G ENS ID
expression_plot = function(melted_df){
plot = ggplot(data = melted_df, aes(x=Var2, y=value, color = condition)) + 
  geom_boxplot()   +
  geom_text(aes(x=Var2,
                label=ifelse(grepl(polr3g_full_name,Var1),".",''))
            , size= 10, color = "black") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, colour = color_cell),
          axis.text=element_text(size=6))  +
  xlab("cell_lines") + ylab("normalized_expression")
  return(plot) }

# from a matrix, extract the rows with the maximum IQR. IQR superior to the quantiles 0.75 (by default)
extract_top_varying_genes = function(matrix, threshold=0.75){
  IQRs = data.frame("IQR" = apply(matrix, 1, IQR))
  IQRs$IQR = as.numeric(IQRs$IQR)
  hist(IQRs$IQR)
  print("threshold :") 
  print(unname(quantile(IQRs$IQR,threshold)))
  genes = rownames(filter(IQRs, IQRs$IQR> quantile(IQRs$IQR,threshold)))
  message("Top ",length(genes)," most varying genes were taken for the PCA")
  return(genes )
}
# D A T A . P R O C E S S

# Processing of genes of interest
# Change row names, and remove uninteresting columns
rows_of_interest = expr[expr$gene_name  %in% names(of_interest) ,]
rows_of_interest$gene_name = NULL
row.names(rows_of_interest) = rows_of_interest$ENSEMBL_id
rows_of_interest$ENSEMBL_id = NULL


# Processing of count matrix
# Avoid error : convert into numeric value
expr <- cbind("ENSEMBL_id"=expr$ENSEMBL_id, 
              mutate_all(select(expr,all_of(cell_lines)), function(x) as.numeric(as.character(x))))

# Remove duplicated id
expr <- expr %>%
  filter(! duplicated(ENSEMBL_id))

row.names(expr) = expr$ENSEMBL_id
expr$ENSEMBL_id = NULL


# Replace 0 by NA (simplify computation step)
expr[expr == 0] = NA

# differentiate CCLE / RoadMap datasets
CCLE_dataset = expr[, 1:20]
RoadMap_dataset = expr[, 21:length(colnames(expr))]

CCLE = colnames(CCLE_dataset)
ROADMAP = colnames(RoadMap_dataset)[-1]

# Filtering rows having more than 75% of 0 / NA
non_null_CCLE = CCLE_dataset %>%
  filter(rowMeans(is.na(.)) < 0.25)

non_null_RoadMap = RoadMap_dataset %>%
  filter(rowMeans(is.na(.)) < 0.25)

# Retrieving rows that pass filters
to_keep = intersect(row.names(non_null_CCLE) , row.names(non_null_RoadMap))

# Filter to keep genes that pass filters 
expr = expr[to_keep, ]
expr[is.na(expr)] = 0


# Using the edgeR filterByExpr function
design = c(rep("CCLE", 20 ), rep("RoadMap",length(cell_lines)-20))
edgeR =  DGEList(counts = expr, group = factor(design))
keep = filterByExpr(edgeR, group = design)
rows_to_keep = row.names(edgeR[keep,]$counts)

# add genes of interest (if they were removed)
rows_to_keep = unique(c(rows_to_keep, row.names(rows_of_interest)))

filtered_expr = expr[rows_to_keep, ]
filtered_expr[is.na(filtered_expr)] = 0

DT::datatable(head(filtered_expr))

Apply DESeq2 Normalisation

count.matrix = data.matrix(filtered_expr)

# normalization
DESeq2_matrix = tools.norm.RNAseq(count.matrix, tool = "vst2", design = design ) 
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# we remove "Universal_human_reference" (irrelevant)
DESeq2_matrix = DESeq2_matrix[, colnames(DESeq2_matrix)!="Universal_Human_Reference"]
count.matrix = count.matrix[, colnames(count.matrix)!="Universal_Human_Reference"]

design = design[1:length(design)-1]
cell_lines = colnames(DESeq2_matrix)

Raw plot : CCLE A549 + K562 vs RoadMap A549 + K562

k562 = colnames(count.matrix)[grepl("K562",colnames(count.matrix))] 
a549 = colnames(count.matrix)[grepl("A549",colnames(count.matrix))]

common_cells= c(k562, a549)

Multi_plot_A549_K562(data.frame(log2(count.matrix+1)), polr3g_full_name, "Raw counts (log2(x)+1)") 

# DESeq2 plot : CCLE A549 + K562 vs RoadMap A549 + K562

Multi_plot_A549_K562(as.data.frame(DESeq2_matrix), polr3g_full_name, "Raw counts (log2(x)+1)") 

# Expression boxplot : Raw

# Raw + log2
melt_raw = reshape2::melt(log2(count.matrix+1))
condition = ifelse(melt_raw$Var2 %in% CCLE, yes = "CCLE", no = "RoadMap")


expression_plot(melt_raw) + ggtitle("Expression boxplot : Raw expression (log2+1)")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

Expression Boxplot : Normalised data

melt_deseq2 = reshape2::melt(DESeq2_matrix)


expression_plot(melt_deseq2) + ggtitle("Expression boxplot : Normalised data (DESeq2)")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

PCA : Raw

raw_matrix = data.frame(log2(count.matrix+1))
pca_subset = raw_matrix[extract_top_varying_genes(raw_matrix, 0.5) , ]
## [1] "threshold :"
## [1] 2.13843
## Top 6975 most varying genes were taken for the PCA

# Highlight A549 and K562 cell lines
design_PCA = ifelse(grepl("K562", cell_lines), yes = "K562", no = design)
design_PCA = ifelse(grepl("A549", cell_lines), yes = "A549", no = design_PCA)

res.pca = PCA(t(pca_subset), graph = F)
fviz_pca_ind(res.pca, 
             col.ind = design_PCA,
             repel = TRUE,
             palette = "Set1",
             ggrepel.max.overlaps = 1) + labs(title ="PCA raw matrix",)

PCA - DESeq2

DESeq2_df = data.frame(DESeq2_matrix)
pca_subset = DESeq2_df[extract_top_varying_genes(DESeq2_df, 0.7) , ]
## [1] "threshold :"
## [1] 1.926973
## Top 4185 most varying genes were taken for the PCA

res.pca = PCA(t(pca_subset), graph = F)
fviz_pca_ind(res.pca, 
             col.ind = design_PCA,
             repel = TRUE,
             palette = "Set1",
             ggrepel.max.overlaps = 1) + labs(title ="PCA - DESeq2",)

Heatmap d’expression de la région autour de POLR3G

# VARIABLES
TNBC = c("MDAMB231","MDAMB436","HCC1937", "HCC1806",
         "HCC1954","MDAMB468")

of_interest = c(
  "POU5F1" ="ENSG00000204531",
  "MYC" =   "ENSG00000136997",
  "POLR3GL"="ENSG00000121851",
  "POLR3G" ="ENSG00000113356",
  "MEF2C" = "ENSG00000081189",
  "CETN3" = "ENSG00000153140",
  "MBLAC2" ="ENSG00000176055",
  "LYSMD3" ="ENSG00000176018",
  "ADGRV1" ="ENSG00000164199")
of_interest = data_frame("ENS_id" = of_interest, "gene_name" = names(of_interest))
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Filtering matrix with only gene of interests
tmp = filter(melt_deseq2, grepl(paste(of_interest$ENS_id,collapse="|"),melt_deseq2$Var1))
tmp$copie = substr(tmp$Var1,1,15)


# merge dataframes to correctly match ENS_ID with gene name
to_heatmap = select(merge(tmp,of_interest, by.x = "copie", by.y = "ENS_id"),
                    gene_name,Var2 ,value)
colnames(to_heatmap) = c("gene_name","cell_line","expression")

# melt ==> classical matrix
heat = reshape2::dcast(to_heatmap, gene_name~cell_line)
## Using expression as value column: use value.var to override.
row.names(heat) = heat$gene_name
heat$gene_name = NULL



# Median centering
heat_norm = t(apply(heat, 1, MedianCentering))

# Create color scale
scale = get_color_scale_pheatmap(heat_norm,ncolors = 50,threshold = 0.5)

# Specify which cell lines are of interest
annotation_heatmap = data.frame("data_source" = design, row.names = cell_lines)
color = c("red","blue")
names(color) = c("CCLE","RoadMap")

annotation_heatmap$state = ifelse(
  grepl(paste(TNBC,collapse="|") ,row.names(annotation_heatmap)), 
  yes = "TNBC", no = "other")

annotation_heatmap$state = ifelse(
  grepl("K562" ,row.names(annotation_heatmap)), 
  yes = "K562", no = annotation_heatmap$state)

annotation_heatmap$state = ifelse(
  grepl("A549" ,row.names(annotation_heatmap)), 
  yes = "A594", no = annotation_heatmap$state)

# rotate and add colors for TNBC - A549 - K562
heatmap = data.frame(t(heat_norm))
col = ifelse(grepl(paste(TNBC,collapse="|") ,row.names(heatmap)), 
                        yes = "red", no = "darkgrey")

col = ifelse(grepl("K562" ,row.names(heatmap)), 
                        yes = "blue", no = col)

col = ifelse(grepl("A549" ,row.names(heatmap)), 
                        yes = "darkgreen", no = col)

heatmap$colors = col

# Ordering rows by their POLR3G expression
heatmap = heatmap[order(-heatmap$POLR3G) , ]

# Ordering columns
heatmap = heatmap[,c("CETN3","MBLAC2","LYSMD3","MYC","POLR3GL","ADGRV1","MEF2C","POLR3G","colors")]


map = pheatmap(select(heatmap,-colors),
         color = scale$colorscale,
         breaks = scale$breaks,
         cluster_rows = F, cluster_cols = F,
         angle_col = 0,
         fontsize_row = 5,
         main = "Expression heatmap : DESeq2 ")

# Change the colors of row names
map$gtable$grobs[[4]]$gp=gpar(col= heatmap$colors, fontsize = 5)


print(map) 

# GGpairs :

# Change the format to be plotted : select the gens of interest
tmp = filter(DESeq2_df, grepl(paste(of_interest$ENS_id,collapse="|"),substr(row.names(DESeq2_df),1,15) ))
tmp$ENS_id = substr(rownames(tmp),1,15)
tmp = merge(tmp, of_interest, by = "ENS_id")
row.names(tmp) = tmp$gene_name
to_be_plotted = select(tmp, -gene_name, -ENS_id)


ggpairs(as.data.frame(t(to_be_plotted)),
        mapping = ggplot2::aes(color =design),
        lower = list(continuous = wrap("smooth", alpha = 0.7, size=1)))